#let us start by importing the relevant libraries
#Import all the necessary modules
import itertools
import os

import matplotlib.pyplot as matplot
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import numpy as numpy
import pandas as pd
import pandas as pandas
import seaborn as sns
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# project-local helper module
from Custom import Perform_EDA as EDA
# Load the data set and take a first look: variables present, dtypes, shape,
# column names, missing values etc.
Source = pd.read_csv("vehicle-1.csv")
Source.head(10)
Source.info()
Source.shape
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
columns = Source.columns
# Label-encode the target column 'class' (labels are assigned in sorted
# order of the class names)
print(columns)
Source['class'] = le.fit_transform(Source['class'])
Source.shape
# Frequency of each class. Series.value_counts() replaces the top-level
# pd.value_counts(), which was deprecated in pandas 2.x and removed in 3.0.
Source['class'].value_counts()
import matplotlib.pyplot as plt
%matplotlib inline
pandas.value_counts(Source["class"]).plot(kind="bar")
pandas.value_counts(Source['class']).hist(bins=300)
Source.isna().sum()
from sklearn.impute import SimpleImputer
newSource = Source.copy()
#interest_Source = newSource.drop('class', axis=1)
# Fill missing values with the column median. The `verbose` parameter was
# deprecated in sklearn 1.1 and removed in 1.3, so it is no longer passed.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
transformed_values = imputer.fit_transform(newSource)
column = newSource.columns
print(column)
# fit_transform returns a bare ndarray; restore the column labels
newdf = pd.DataFrame(transformed_values, columns=column)
newdf.describe()
newdf.isna().sum()
from scipy.stats import zscore
# Standardise every column (z-score) purely to eyeball the distributions
standardised = newdf.apply(zscore)
interest_df_z = standardised
interest_df_z.head()
newdf.describe().T
# Quick insights on the descriptive stats:
# - compactness: mean ~ median, so roughly normal with no obvious skew/outliers
# - circularity: likewise appears normally distributed
# - scatter_ratio, scaled_variance 1 & 2: show signs of skewness and outliers
newdf.shape
# 'seaborn-whitegrid' was renamed 'seaborn-v0_8-whitegrid' in matplotlib 3.6;
# try the new name first and fall back for older matplotlib versions.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
newdf.hist(bins=20, figsize=(60,40), color='lightblue', edgecolor = 'red')
plt.show()
# Observation (originally a bare prose line -- a SyntaxError in plain Python):
# most attributes look normally distributed; scaled_variance 1, skewness_about
# 1 & 2 and scatter_ratio appear right skewed; pr.axis_rectangularity seems to
# carry outliers, given the gaps in the bars.
# Analyze the distribution and skewness of selected columns. sns.distplot
# was deprecated in seaborn 0.11 and removed in 0.14; histplot(kde=True)
# is the modern equivalent.
f, ax = plt.subplots(1, 6, figsize=(30,5))
vis1 = sns.histplot(newdf["scaled_variance.1"], bins=10, kde=True, ax=ax[0])
vis2 = sns.histplot(newdf["scaled_variance"], bins=10, kde=True, ax=ax[1])
vis3 = sns.histplot(newdf["skewness_about.1"], bins=10, kde=True, ax=ax[2])
vis4 = sns.histplot(newdf["skewness_about"], bins=10, kde=True, ax=ax[3])
# NOTE(review): the original skipped ax[4] (no vis5), leaving a blank panel
vis6 = sns.histplot(newdf["scatter_ratio"], bins=10, kde=True, ax=ax[5])
f.savefig('subplot.png')
skewValue = newdf.skew()
print("skewValue of dataframe attributes:\n\n",skewValue)
# Summary boxplot of all attributes in one axes; then individual boxplots
# (via the project-local EDA helper) to trace outliers per column.
ax = sns.boxplot(data=newdf, orient="h")
# Custom helper -- presumably draws one univariate plot per column; its exact
# behavior is not visible from this file, verify in the Custom module.
EDA.univariate_plots(newdf)
# Observation on boxplots:
# pr.axis_aspect_ratio, skewness_about, max_length_aspect_ratio, skewness_about_1,
# scaled_radius_of_gyration.1, scaled_variance.1, radius_ratio, skewness_about,
# scaled_variance.1 are some of the attributes with outliers,
# visible as the dotted points beyond the whiskers.
# Treating outliers using the IQR rule:
# The interquartile range (IQR), also called the midspread or middle 50%,
# is a measure of statistical dispersion equal to the difference between the
# 75th and 25th percentiles (upper and lower quartiles): IQR = Q3 - Q1.
newdf.shape
from scipy.stats import iqr
# Quartiles and interquartile range per column
Q1 = newdf.quantile(0.25)
Q3 = newdf.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
# Keep only rows where every attribute lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
outside_fences = (newdf < (Q1 - 1.5 * IQR)) | (newdf > (Q3 + 1.5 * IQR))
cleandf = newdf[~outside_fences.any(axis=1)]
cleandf.shape
EDA.univariate_plots(cleandf)
# The boxplots of the attributes that had outliers are now clean.
# Since the number of outliers was small, we opted to remove them.
# Generally this is avoided, as it can lose information on large data sets
# with many outliers.
# Pearson correlation coefficient: see which attributes are linearly related,
# and visualise the relationships with seaborn.
# NOTE(review): this reassignment discards the IQR-cleaned frame built above --
# from here on "cleandf" is just newdf without the target column. Kept as-is
# to preserve the original analysis flow.
cleandf = newdf.drop('class', axis=1)
EDA.EDA_Corr(cleandf)
# Compare all attributes visually to check for exploitable relationships
sns.pairplot(cleandf, diag_kind="kde")
# - scaled_variance & scaled_variance.1: very strong positive correlation (~0.98)
# - skewness_about_2 & hollow_ratio: strong positive correlation (~0.89)
# - scatter_ratio & elongatedness: very strong negative correlation
# - elongatedness & pr.axis_rectangularity: strong negative correlation
# Many features correlate above 0.9; candidates for removal (|corr| >= 0.9):
#   max.length_rectangularity, scaled_radius_of_gyration, skewness_about.2,
#   scatter_ratio, elongatedness, pr.axis_rectangularity,
#   scaled_variance, scaled_variance.1
# Display how many rows are car / bus / van. Seaborn >= 0.12 requires the
# data to be passed by keyword, hence x= instead of a positional Series.
sns.countplot(x=newdf["class"])
# PCA is a dimensionality-reduction technique: it maps a large set of (often
# correlated) variables to a smaller set of uncorrelated variables -- the
# principal components -- that retain most of the information.
# Principal components are linear combinations of the initial variables,
# constructed so that they are uncorrelated and most of the variance is
# squeezed into the first few components.
# Steps performed below:
#   1. split the data into train and test sets
#   2. standardise the training set
#   3. compute the covariance matrix
#   4. compute its eigenvectors and eigenvalues
#   5. sort the eigenvectors by eigenvalue, descending
#   6. keep the first K eigenvectors (K = target dimensionality)
#   7. project the data onto the reduced space
# Separate the data into independent and dependent attributes: columns 0..17
# are the silhouette features, column 18 is the encoded class.
feature_part = newdf.iloc[:, :18]
target_part = newdf.iloc[:, 18]
X = feature_part.values
y = target_part.values
X
# Scale the independent variables to zero mean / unit variance
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_std = sc.fit_transform(X)
X_std
# Covariance matrix of the standardised features. The original mixed the
# "numpy" and "np" aliases in this section (np was unbound under the original
# imports); everything here uses np, the file-wide convention.
cov_matrix = np.cov(X_std.T)
print("cov_matrix shape:", cov_matrix.shape)
print("Covariance_matrix", cov_matrix)
# Eigen-decomposition of the covariance matrix via numpy's linalg module
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
print('Eigen Vectors \n%s', eigenvectors)
print('\n Eigen Values \n%s', eigenvalues)
# Percentage of variance explained by each component, plus the cumulative sum
tot = sum(eigenvalues)
var_exp = [(i / tot) * 100 for i in sorted(eigenvalues, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
plt.plot(var_exp)
# Scree plot: individual and cumulative explained variance per component
plt.figure(figsize=(10, 5))
plt.bar(range(1, eigenvalues.size + 1), var_exp, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(1, eigenvalues.size + 1), cum_var_exp, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# The plot shows ~8 components explain about 95% of the variance, so the
# first 8 principal components are used from here on.
# Pair each eigenvalue with its eigenvector (columns of `eigenvectors`):
eig_pairs = [(eigenvalues[index], eigenvectors[:, index]) for index in range(len(eigenvalues))]
# Sort descending by eigenvalue ONLY. The original used plain tuple sort plus
# reverse(), which falls back to comparing the eigenvector arrays whenever two
# eigenvalues are equal and raises "truth value of an array is ambiguous";
# keying on the eigenvalue avoids that.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)
# Unzip the sorted (eigenvalue, eigenvector) pairs, still in descending order
eigvalues_sorted = [value for value, _ in eig_pairs]
eigvectors_sorted = [vector for _, vector in eig_pairs]
# Confirm the sorting worked by printing the eigenvalues
print('Eigenvalues in descending order: \n%s' %eigvalues_sorted)
# P_reduce spans the reduced mathematical space: keep the top 8 eigenvectors
# (dimensionality 18 -> 8)
P_reduce = np.array(eigvectors_sorted[0:8])
# Project the standardised data onto the principal-component axes
X_std_8D = np.dot(X_std, P_reduce.T)
# Wrap the array as a DataFrame so it can be fed to pairplot
reduced_pca = pd.DataFrame(X_std_8D)
reduced_pca
sns.pairplot(reduced_pca, diag_kind='kde')
# After dimensionality reduction the attributes are uncorrelated: the pairplot
# shows clouds of points with no linear kind of relationship.
# Build two support vector classifier models: one on the 18 original
# standardised variables and one on the 8 PCA components.
# Split the data 70:30.
# Original data
Orig_X_train,Orig_X_test,Orig_y_train,Orig_y_test = train_test_split(X_std, y, test_size=0.30, random_state=1)
# PCA data (same random_state, so the row split matches the original split)
pca_X_train,pca_X_test,pca_y_train,pca_y_test = train_test_split(reduced_pca, y, test_size=0.30, random_state=1)
# Fit an SVC on the original (standardised) data
svc = SVC() #instantiate the object
# fit the model on the original standardised data
svc.fit(Orig_X_train, Orig_y_train)
# predict the y value
Orig_y_predict = svc.predict(Orig_X_test)
# Fit a second SVC on the PCA-reduced data
svc1 = SVC() #instantiate the object
svc1.fit(pca_X_train, pca_y_train)
# predict the y value
pca_y_predict = svc1.predict(pca_X_test)
# Display accuracy score of both models
SVC_Org_Score = svc.score(Orig_X_test, Orig_y_test)
print("Model Score On Original Data ", SVC_Org_Score)
SVC_PCA8_Score = svc1.score(pca_X_test, pca_y_test)
print("Model Score On Reduced PCA Dimension ",SVC_PCA8_Score)
SVC_Org_Accuracy = accuracy_score(Orig_y_test, Orig_y_predict)
print("Before PCA On Original 18 Dimension", SVC_Org_Accuracy)
SVC_PCA8_Accuracy = accuracy_score(pca_y_test, pca_y_predict)
print("After PCA (On 8 dimension)", SVC_PCA8_Accuracy)
# 10-fold cross-validation on the full data. NOTE(review): cross_val_score
# must already be in scope here; verify it is imported before this point.
SVC_Org_CrossScore = cross_val_score(svc, X_std, y, cv=10, scoring='accuracy').mean()
print('Cross Validation Score [Scaled]', SVC_Org_CrossScore)
SVC_PCA8_CrossScore = cross_val_score(svc1, reduced_pca, y, cv=10, scoring='accuracy').mean()
print('Cross Validation Score [PCA]', SVC_PCA8_CrossScore)
# Observations
# - Without PCA the support vector classifier scores ~95% accuracy on the
#   test split; on the 8 PCA components it scores ~93%.
# - Given the reduction from 18 to 8 dimensions, the PCA model holds up well.
# Calculate the confusion matrix and plot it as a heatmap.
def draw_confmatrix(y_test, yhat, str1, str2, str3, datatype ):
    """Print and heatmap the 3-class confusion matrix.

    y_test / yhat : true and predicted encoded labels (0, 1, 2)
    str1..str3    : display names for the three classes, in label order
    datatype      : caption describing which model produced the predictions
    """
    # `labels` must be passed by keyword: sklearn 1.0+ rejects extra
    # positional arguments to confusion_matrix.
    cm = confusion_matrix( y_test, yhat, labels=[0, 1, 2] )
    print("Confusion Matrix For :", "\n",datatype, cm )
    sns.heatmap(cm, annot=True, cmap='Blues',fmt='g', xticklabels = [str1, str2,str3] , yticklabels = [str1, str2,str3] )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
draw_confmatrix(Orig_y_test, Orig_y_predict,"Van ", "Car ", "Bus", "Original Data Set\n" )
draw_confmatrix(pca_y_test, pca_y_predict,"Van ", "Car ", "Bus", "For Reduced Dimensions Using PCA\n")
# Confusion matrix analysis on the original data
#
# Original data set: [[ 58 0 1] [ 1 129 3] [ 6 1 55]]
#
# On the original data the model correctly classified 58 of 59 actual vans,
# erring only once by predicting a van to be a bus.
#
# Of 133 actual cars it correctly classified 129; it wrongly classified
# 3 cars as buses and 1 car as a van.
#
# Of 62 actual buses it correctly classified 55; it wrongly classified
# 6 buses as vans and 1 bus as a car.
# Confusion matrix analysis on the reduced dimensions after PCA
#
# For reduced dimensions using PCA: [[ 57 2 0][ 2 126 5][ 1 7 54]]
#
# Of 59 actual vans the model correctly predicted 57, erring in 2 instances
# where vans were classified as cars.
#
# Of 133 actual cars it correctly classified 126, erring in 7 cases:
# 5 cars classified as buses and 2 as vans.
#
# Of 62 actual buses it correctly classified 54, erring in 8 cases:
# 7 buses classified as cars and 1 as a van.
# Classification report of the model built on raw data
print("Classification Report For Raw Data:", "\n", classification_report(Orig_y_test, Orig_y_predict))
# Classification report of the model built on principal components
print("Classification Report For PCA:","\n", classification_report(pca_y_test, pca_y_predict))
# On the original data the model has 99% precision for cars, 89% for vans
# and 93% for buses.
# On the PCA-reduced data the model's precision for vans rises to 95%,
# better than the 89% achieved on the original data.
# Logistic regression, fitted on the raw scaled data and on the PCA components
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(Orig_X_train, Orig_y_train)
Logistic_Org_Score = model.score(Orig_X_test, Orig_y_test)
print ('Before PCA score', Logistic_Org_Score)
# Re-fit the SAME estimator object on the PCA training split
model.fit(pca_X_train, pca_y_train)
Logistic_PCA_Score = model.score(pca_X_test, pca_y_test)
print ('After PCA score', Logistic_PCA_Score)
# 10-fold cross-validation on the full data sets (requires cross_val_score
# to be in scope; verify it is imported before this point)
Logistic_Org_CrossScore = cross_val_score(model, X_std, y, cv=10, scoring='accuracy').mean()
print('Cross Validation Score [Scaled]', Logistic_Org_CrossScore)
Logistic_PCA8_CrossScore = cross_val_score(model, reduced_pca, y, cv=10, scoring='accuracy').mean()
print('Cross Validation Score [PCA]', Logistic_PCA8_CrossScore)
# Gaussian naive Bayes, raw scaled data vs PCA components
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(Orig_X_train, Orig_y_train)
NB_Org_Score = nb.score(Orig_X_test, Orig_y_test)
print ('Before PCA score', NB_Org_Score)
# Re-fit the same estimator on the PCA training split
nb.fit(pca_X_train, pca_y_train)
NB_PCA8_Score = nb.score(pca_X_test, pca_y_test)
print ('After PCA score', NB_PCA8_Score)
# 10-fold cross-validation on the full data sets
NB_Org_CrossScore = cross_val_score(nb, X_std, y, cv=10, scoring='accuracy').mean()
print('Cross Validation Score [Scaled]', NB_Org_CrossScore)
NB_PCA8_CrossScore = cross_val_score(nb, reduced_pca, y, cv=10, scoring='accuracy').mean()
print('Cross Validation Score [PCA]', NB_PCA8_CrossScore)
# Decision tree (entropy criterion), raw scaled data vs PCA components.
# NOTE(review): no random_state is set, so tie-breaking in the tree may make
# these scores vary between runs -- confirm whether reproducibility matters.
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion = 'entropy' )
dt_model.fit(Orig_X_train, Orig_y_train)
DT_Org_Score = dt_model.score(Orig_X_test, Orig_y_test)
print ('Before PCA score', DT_Org_Score)
# Re-fit the same estimator on the PCA training split
dt_model.fit(pca_X_train, pca_y_train)
DT_PCA8_Score = dt_model.score(pca_X_test, pca_y_test)
print ('After PCA score', DT_PCA8_Score)
# 10-fold cross-validation on the full data sets
DT_Org_CrossScore = cross_val_score(dt_model, X_std, y, cv=10, scoring='accuracy').mean()
print('Cross Validation Score [Scaled]', DT_Org_CrossScore)
DT_PCA8_CrossScore = cross_val_score(dt_model, reduced_pca, y, cv=10, scoring='accuracy').mean()
print('Cross Validation Score [PCA]', DT_PCA8_CrossScore)
# Collect every model's test accuracy and 10-fold CV score into one table
model_labels = ['SVC Raw', 'SVC PCA 8', 'Logistic Raw', 'Logistic PCA 8',
                'Naive Bayes Raw', 'Naive Bayes PCA 8',
                'DecisionTree Raw', 'DecisionTree PCA 8']
score_rows = [[SVC_Org_Accuracy, SVC_Org_CrossScore],
              [SVC_PCA8_Accuracy, SVC_PCA8_CrossScore],
              [Logistic_Org_Score, Logistic_Org_CrossScore],
              [Logistic_PCA_Score, Logistic_PCA8_CrossScore],
              [NB_Org_Score, NB_Org_CrossScore],
              [NB_PCA8_Score, NB_PCA8_CrossScore],
              [DT_Org_Score, DT_Org_CrossScore],
              [DT_PCA8_Score, DT_PCA8_CrossScore]]
Compare = score_rows
Compare_Models = pandas.DataFrame(Compare, columns=['Accuracy', 'Cross Val Score'], index=model_labels)
Compare_Models
# The support vector classifier looks like the best model for classifying
# the given silhouette info as van, bus or car, on the original data.
# 50-fold cross-validation of logistic regression, on the original scaled data
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
num_folds = 50
seed = 7
# shuffle=True is required: sklearn >= 0.24 raises a ValueError when
# random_state is set on an unshuffled KFold.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X_std, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# 95% coverage ~ mean +/- 2 * standard deviation (numbers from one sample run)
93.382 - 2 * 5.611
93.382 + 2 * 5.611
# Same exercise on the PCA data / 8 dimensions
kfold = KFold(n_splits=50, shuffle=True, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, reduced_pca, y, cv=kfold)
print(results)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# 95% coverage ~ mean +/- 2 * standard deviation (numbers from one sample run)
82.640 - 2 * 9.851
82.640 + 2 * 9.851
# scikit-learn's PCA performs all the above steps (centering, decomposition,
# projection) in one shot.
from sklearn.decomposition import PCA
# Only 8 components are generated: dimensionality reduction from 18 to 8
pca = PCA(n_components=8)
# fit + project in a single call (equivalent to fit() followed by transform())
X_pca = pca.fit_transform(X_std)
print("original shape: ", X_std.shape)
print("transformed shape:", X_pca.shape)
pca.explained_variance_
pca.components_
# Component loadings per original feature, shown as a heatmap
df_comp = pd.DataFrame(pca.components_, columns=list(cleandf))
df_comp.head()
plt.figure(figsize=(12,6))
sns.heatmap(df_comp,cmap='plasma',)
# Independent variables. NOTE(review): this rebinds X_pca, shadowing the
# projected array produced just above.
X_pca = newdf.drop(['class'], axis=1)
# The dependent variable, kept as a ONE-COLUMN DataFrame (double brackets)
y_pca = newdf[['class']]
sns.pairplot(X_pca, diag_kind='kde') # to plot density curve instead of histogram on the diag
from scipy.stats import zscore
# Standardise the features column-wise
XScaled = X_pca.apply(zscore)
XScaled.head()
# Covariance matrix of the scaled features (rowvar=False: rows = observations)
covMatrix = np.cov(XScaled, rowvar = False)
print(covMatrix)
# Full-rank PCA (all 18 components) to inspect the variance profile
pca = PCA(n_components=18)
pca.fit(XScaled)
print(pca.explained_variance_)
print(pca.components_)
print(pca.explained_variance_ratio_)
# Bar chart of per-component variance, then the cumulative step plot
component_ids = list(range(1, 19))
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Keep the first 8 components and project the scaled data onto them
pca8 = PCA(n_components = 8)
pca8.fit(XScaled)
print(pca8.components_)
print(pca8.explained_variance_ratio_)
Xpca8 = pca8.transform(XScaled)
sns.pairplot(pd.DataFrame(Xpca8))
# Construct two linear models: the first with all 18 independent variables,
# the second with only the 8 new variables constructed using PCA.
from sklearn.linear_model import LinearRegression
# Flatten the one-column target DataFrame to a 1-D array once: passing a
# column vector to fit() triggers a DataConversionWarning in sklearn.
y_flat = np.ravel(y_pca)
regression_model = LinearRegression()
regression_model.fit(XScaled, y_flat)
print ('Before PCA score', regression_model.score(XScaled, y_flat))
regression_model_pca = LinearRegression()
regression_model_pca.fit(Xpca8, y_flat)
print ('After PCA score', regression_model_pca.score(Xpca8, y_flat))
# Logistic regression on the same two feature sets
model = LogisticRegression()
model.fit(XScaled, y_flat)
print ('Before PCA score', model.score(XScaled, y_flat))
model.fit(Xpca8, y_flat)
print ('After PCA score', model.score(Xpca8, y_flat))
print('Cross Validation Score [Scaled]', cross_val_score(model, XScaled, y, cv=10, scoring='accuracy').mean())
print('Cross Validation Score [PCA]', cross_val_score(model, Xpca8, y, cv=10, scoring='accuracy').mean())
# Naive Bayes and decision tree, raw scaled data vs PCA data.
# The original mixed `y` (1-D array) and `y_pca` (one-column DataFrame)
# between fit() and score(); both hold the same labels, so `y` is used
# throughout for consistency and to avoid DataConversionWarning.
nb = GaussianNB()
nb.fit(XScaled, y)
print ('Before PCA score', nb.score(XScaled, y))
nb.fit(Xpca8, y)
print ('After PCA score', nb.score(Xpca8, y))
print('Cross Validation Score [Scaled]', cross_val_score(nb, XScaled, y, cv=10, scoring='accuracy').mean())
print('Cross Validation Score [PCA]', cross_val_score(nb, Xpca8, y, cv=10, scoring='accuracy').mean())
dt_model = DecisionTreeClassifier(criterion = 'entropy' )
dt_model.fit(XScaled, y)
print ('Before PCA score', dt_model.score(XScaled, y))
dt_model.fit(Xpca8, y)
print ('After PCA score', dt_model.score(Xpca8, y))
print('Cross Validation Score [Scaled]', cross_val_score(dt_model, XScaled, y, cv=10, scoring='accuracy').mean())
print('Cross Validation Score [PCA]', cross_val_score(dt_model, Xpca8, y, cv=10, scoring='accuracy').mean())